import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import warnings
# Suppress every warning category for the remainder of the session.
# NOTE(review): a blanket "ignore" also hides library warnings (deprecations,
# solver convergence notices) — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Read the employee-attrition CSV (comma separated, UTF-8) from the working
# directory and display the resulting frame.
csv_path = "WA_Fn-UseC_-HR-Employee-Attrition.csv"
df = pd.read_csv(csv_path, sep=",", encoding="utf-8")
df
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 36 | No | Travel_Frequently | 884 | Research & Development | 23 | 2 | Medical | 1 | 2061 | ... | 3 | 80 | 1 | 17 | 3 | 3 | 5 | 2 | 0 | 3 |
| 1466 | 39 | No | Travel_Rarely | 613 | Research & Development | 6 | 1 | Medical | 1 | 2062 | ... | 1 | 80 | 1 | 9 | 5 | 3 | 7 | 7 | 1 | 7 |
| 1467 | 27 | No | Travel_Rarely | 155 | Research & Development | 4 | 3 | Life Sciences | 1 | 2064 | ... | 2 | 80 | 1 | 6 | 0 | 3 | 6 | 2 | 0 | 3 |
| 1468 | 49 | No | Travel_Frequently | 1023 | Sales | 2 | 3 | Medical | 1 | 2065 | ... | 4 | 80 | 0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 |
| 1469 | 34 | No | Travel_Rarely | 628 | Research & Development | 8 | 3 | Medical | 1 | 2068 | ... | 1 | 80 | 0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 |
1470 rows × 35 columns
# (row count, column count) of the loaded frame.
df.shape
(1470, 35)
# Rows that repeat an earlier row in every column; an empty result means
# the data has no duplicates.
df[df.duplicated()]
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager |
|---|
0 rows × 35 columns
# Column labels of the loaded frame.
df.columns
Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
'YearsWithCurrManager'],
dtype='object')
# Print per-column dtype and non-null count, plus memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1470 entries, 0 to 1469 Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 1470 non-null int64 1 Attrition 1470 non-null object 2 BusinessTravel 1470 non-null object 3 DailyRate 1470 non-null int64 4 Department 1470 non-null object 5 DistanceFromHome 1470 non-null int64 6 Education 1470 non-null int64 7 EducationField 1470 non-null object 8 EmployeeCount 1470 non-null int64 9 EmployeeNumber 1470 non-null int64 10 EnvironmentSatisfaction 1470 non-null int64 11 Gender 1470 non-null object 12 HourlyRate 1470 non-null int64 13 JobInvolvement 1470 non-null int64 14 JobLevel 1470 non-null int64 15 JobRole 1470 non-null object 16 JobSatisfaction 1470 non-null int64 17 MaritalStatus 1470 non-null object 18 MonthlyIncome 1470 non-null int64 19 MonthlyRate 1470 non-null int64 20 NumCompaniesWorked 1470 non-null int64 21 Over18 1470 non-null object 22 OverTime 1470 non-null object 23 PercentSalaryHike 1470 non-null int64 24 PerformanceRating 1470 non-null int64 25 RelationshipSatisfaction 1470 non-null int64 26 StandardHours 1470 non-null int64 27 StockOptionLevel 1470 non-null int64 28 TotalWorkingYears 1470 non-null int64 29 TrainingTimesLastYear 1470 non-null int64 30 WorkLifeBalance 1470 non-null int64 31 YearsAtCompany 1470 non-null int64 32 YearsInCurrentRole 1470 non-null int64 33 YearsSinceLastPromotion 1470 non-null int64 34 YearsWithCurrManager 1470 non-null int64 dtypes: int64(26), object(9) memory usage: 402.1+ KB
# Summary statistics of the numeric columns, one row per column.
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Age | 1470.0 | 36.923810 | 9.135373 | 18.0 | 30.00 | 36.0 | 43.00 | 60.0 |
| DailyRate | 1470.0 | 802.485714 | 403.509100 | 102.0 | 465.00 | 802.0 | 1157.00 | 1499.0 |
| DistanceFromHome | 1470.0 | 9.192517 | 8.106864 | 1.0 | 2.00 | 7.0 | 14.00 | 29.0 |
| Education | 1470.0 | 2.912925 | 1.024165 | 1.0 | 2.00 | 3.0 | 4.00 | 5.0 |
| EmployeeCount | 1470.0 | 1.000000 | 0.000000 | 1.0 | 1.00 | 1.0 | 1.00 | 1.0 |
| EmployeeNumber | 1470.0 | 1024.865306 | 602.024335 | 1.0 | 491.25 | 1020.5 | 1555.75 | 2068.0 |
| EnvironmentSatisfaction | 1470.0 | 2.721769 | 1.093082 | 1.0 | 2.00 | 3.0 | 4.00 | 4.0 |
| HourlyRate | 1470.0 | 65.891156 | 20.329428 | 30.0 | 48.00 | 66.0 | 83.75 | 100.0 |
| JobInvolvement | 1470.0 | 2.729932 | 0.711561 | 1.0 | 2.00 | 3.0 | 3.00 | 4.0 |
| JobLevel | 1470.0 | 2.063946 | 1.106940 | 1.0 | 1.00 | 2.0 | 3.00 | 5.0 |
| JobSatisfaction | 1470.0 | 2.728571 | 1.102846 | 1.0 | 2.00 | 3.0 | 4.00 | 4.0 |
| MonthlyIncome | 1470.0 | 6502.931293 | 4707.956783 | 1009.0 | 2911.00 | 4919.0 | 8379.00 | 19999.0 |
| MonthlyRate | 1470.0 | 14313.103401 | 7117.786044 | 2094.0 | 8047.00 | 14235.5 | 20461.50 | 26999.0 |
| NumCompaniesWorked | 1470.0 | 2.693197 | 2.498009 | 0.0 | 1.00 | 2.0 | 4.00 | 9.0 |
| PercentSalaryHike | 1470.0 | 15.209524 | 3.659938 | 11.0 | 12.00 | 14.0 | 18.00 | 25.0 |
| PerformanceRating | 1470.0 | 3.153741 | 0.360824 | 3.0 | 3.00 | 3.0 | 3.00 | 4.0 |
| RelationshipSatisfaction | 1470.0 | 2.712245 | 1.081209 | 1.0 | 2.00 | 3.0 | 4.00 | 4.0 |
| StandardHours | 1470.0 | 80.000000 | 0.000000 | 80.0 | 80.00 | 80.0 | 80.00 | 80.0 |
| StockOptionLevel | 1470.0 | 0.793878 | 0.852077 | 0.0 | 0.00 | 1.0 | 1.00 | 3.0 |
| TotalWorkingYears | 1470.0 | 11.279592 | 7.780782 | 0.0 | 6.00 | 10.0 | 15.00 | 40.0 |
| TrainingTimesLastYear | 1470.0 | 2.799320 | 1.289271 | 0.0 | 2.00 | 3.0 | 3.00 | 6.0 |
| WorkLifeBalance | 1470.0 | 2.761224 | 0.706476 | 1.0 | 2.00 | 3.0 | 3.00 | 4.0 |
| YearsAtCompany | 1470.0 | 7.008163 | 6.126525 | 0.0 | 3.00 | 5.0 | 9.00 | 40.0 |
| YearsInCurrentRole | 1470.0 | 4.229252 | 3.623137 | 0.0 | 2.00 | 3.0 | 7.00 | 18.0 |
| YearsSinceLastPromotion | 1470.0 | 2.187755 | 3.222430 | 0.0 | 0.00 | 1.0 | 3.00 | 15.0 |
| YearsWithCurrManager | 1470.0 | 4.123129 | 3.568136 | 0.0 | 2.00 | 3.0 | 7.00 | 17.0 |
# Summary (count / unique / top / freq) of the object-dtype columns.
df.describe(include="O")
| Attrition | BusinessTravel | Department | EducationField | Gender | JobRole | MaritalStatus | Over18 | OverTime | |
|---|---|---|---|---|---|---|---|---|---|
| count | 1470 | 1470 | 1470 | 1470 | 1470 | 1470 | 1470 | 1470 | 1470 |
| unique | 2 | 3 | 3 | 6 | 2 | 9 | 3 | 1 | 2 |
| top | No | Travel_Rarely | Research & Development | Life Sciences | Male | Sales Executive | Married | Y | No |
| freq | 1233 | 1043 | 961 | 606 | 882 | 326 | 673 | 1470 | 1054 |
# `data` is a copy of `df` whose categorical columns are replaced by integer
# label codes; `df` itself keeps the original string values.
data = df.copy()
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cols = [
    'Attrition', 'BusinessTravel', 'Department',
    'EducationField', 'Gender', 'JobRole', 'MaritalStatus',
    'Over18', 'OverTime',
]
# Fit a fresh encoder on each column so every column gets its own 0..k-1 codes.
for col in cols:
    data[col] = LabelEncoder().fit_transform(data[col])
data.head()
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | 1 | 2 | 1102 | 2 | 1 | 2 | 1 | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | 0 | 1 | 279 | 1 | 8 | 1 | 1 | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | 1 | 2 | 1373 | 1 | 2 | 2 | 4 | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | 0 | 1 | 1392 | 1 | 3 | 4 | 1 | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | 0 | 2 | 591 | 1 | 2 | 1 | 3 | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 35 columns
import matplotlib.pyplot as plt

# Pie chart of the share of each (encoded) attrition class.
plt.figure(figsize=(7, 7))
# Slice colours, in the order the value counts are returned.
colors = ['#FF6B6B', '#6BFFA6']
attrition_counts = data['Attrition'].value_counts()
attrition_counts.plot.pie(colors=colors, autopct="%1.1f%%")
plt.title('Attrition percentage')
plt.show()
# Pie chart of the gender split, using the original string labels in `df`.
plt.figure(figsize=(5, 5))
gender_counts = df['Gender'].value_counts()
gender_counts.plot.pie(autopct="%1.1f%%", colors=['#1f6eed', '#ed1fc0'])
plt.title('Gender percentage')
# Render explicitly, like the attrition pie above: without this the cell's
# last value (a Text object) is echoed and nothing is drawn outside inline mode.
plt.show()
Text(0.5, 1.0, 'Gender percentage')
# Number of employees per business-travel category.
# Plot from the raw frame `df` so the x-axis shows the category names rather
# than the integer codes stored in the label-encoded frame `data`.
plt.figure(figsize=(10, 5))
sns.countplot(x='BusinessTravel', data=df, palette='Blues')
plt.grid(True)
plt.title('Distribution Of BusinessTravel', fontsize=20)
plt.xlabel('BusinessTravel', fontsize=20)
plt.ylabel('Count', fontsize=20)
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.show()
# Number of employees per department.
# Plot from the raw frame `df` so the x-axis shows the department names rather
# than the integer codes stored in the label-encoded frame `data`.
plt.figure(figsize=(10, 5))
sns.countplot(x='Department', data=df, palette='Reds')
plt.grid(True)
plt.title('Distribution Of Department', fontsize=20)
plt.xlabel('Department', fontsize=20)
plt.ylabel('Count', fontsize=20)
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.show()
# Contingency table: encoded attrition class (rows) by encoded department (columns).
pd.crosstab(data['Attrition'], data['Department'])
| Department | 0 | 1 | 2 |
|---|---|---|---|
| Attrition | |||
| 0 | 51 | 828 | 354 |
| 1 | 12 | 133 | 92 |
# Keep only the employees who left (Attrition == "Yes").
left_mask = df['Attrition'] == "Yes"
Attrition = df[left_mask]
Attrition
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 14 | 28 | Yes | Travel_Rarely | 103 | Research & Development | 24 | 3 | Life Sciences | 1 | 19 | ... | 2 | 80 | 0 | 6 | 4 | 3 | 4 | 2 | 0 | 3 |
| 21 | 36 | Yes | Travel_Rarely | 1218 | Sales | 9 | 4 | Life Sciences | 1 | 27 | ... | 2 | 80 | 0 | 10 | 4 | 3 | 5 | 3 | 0 | 3 |
| 24 | 34 | Yes | Travel_Rarely | 699 | Research & Development | 6 | 1 | Medical | 1 | 31 | ... | 3 | 80 | 0 | 8 | 2 | 3 | 4 | 2 | 1 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1438 | 23 | Yes | Travel_Frequently | 638 | Sales | 9 | 3 | Marketing | 1 | 2023 | ... | 1 | 80 | 1 | 1 | 3 | 2 | 1 | 0 | 1 | 0 |
| 1442 | 29 | Yes | Travel_Rarely | 1092 | Research & Development | 1 | 4 | Medical | 1 | 2027 | ... | 2 | 80 | 3 | 4 | 3 | 4 | 2 | 2 | 2 | 2 |
| 1444 | 56 | Yes | Travel_Rarely | 310 | Research & Development | 7 | 2 | Technical Degree | 1 | 2032 | ... | 4 | 80 | 1 | 14 | 4 | 1 | 10 | 9 | 9 | 8 |
| 1452 | 50 | Yes | Travel_Frequently | 878 | Sales | 1 | 4 | Life Sciences | 1 | 2044 | ... | 4 | 80 | 2 | 12 | 3 | 3 | 6 | 3 | 0 | 1 |
| 1461 | 50 | Yes | Travel_Rarely | 410 | Sales | 28 | 3 | Marketing | 1 | 2055 | ... | 2 | 80 | 1 | 20 | 3 | 3 | 3 | 2 | 2 | 0 |
237 rows × 35 columns
# How many of the employees who left belonged to each department.
Attrition['Department'].value_counts()
Research & Development 133 Sales 92 Human Resources 12 Name: Department, dtype: int64
import plotly.express as px

# Donut chart of the departments of employees who left.
# No matplotlib figure is opened here: the chart is drawn by plotly, and a
# `plt.figure(...)` call would only leave an empty matplotlib figure behind.
Department_count = Attrition['Department'].value_counts()
fig = px.pie(Attrition, values=Department_count, names=Department_count.index)
# All trace styling in one call; "label+percent+name" is the hover setting
# that was effectively applied when the settings were split over two calls.
fig.update_traces(
    hole=0.5,
    hoverinfo="label+percent+name",
    textfont_size=20,
    marker=dict(
        colors=['#FFB3B3', '#C1EFFF', '#FFDBA4'],
        line=dict(color='#fafafa', width=2),
    ),
)
# Caption placed in the centre of the donut hole.
fig.update_layout(
    annotations=[dict(text='Department', x=0.5, y=0.5, font_size=20, showarrow=False)]
)
fig.show()
<Figure size 1000x1000 with 0 Axes>
# Empirical cumulative distribution of MonthlyRate, one curve per attrition class.
fig = px.ecdf(data_frame=df, color="Attrition", x="MonthlyRate")
fig.show()
# Grouped bar counts of OverTime values, split by attrition class.
fig = px.histogram(data_frame=df, x='OverTime', color='Attrition', barmode="group")
fig.update_layout(template='plotly_white')
fig.show()
# Box plot of EnvironmentSatisfaction (x) against attrition class (y).
fig = px.box(data_frame=df, y='Attrition', x="EnvironmentSatisfaction")
fig.show()
# Employees whose job-satisfaction rating is 2 or 3.
jobsat = df[df['JobSatisfaction'].isin([2, 3])]
# Percentage of each job-satisfaction rating among the employees who left.
leaver_share = Attrition['JobSatisfaction'].value_counts() / Attrition.shape[0] * 100
leaver_share.plot.bar(color=["#68228B", '#79CDCD', '#00E5EE', '#FCBAAD'])
plt.title('Attrition job satisfaction')
Text(0.5, 1.0, 'Attrition job satisfaction')
# Box plot of JobSatisfaction (x) against attrition class (y).
fig = px.box(data_frame=df, y='Attrition', x="JobSatisfaction")
fig.show()
# Bucket the commute into two classes: 'Near' for a DistanceFromHome of at
# most 13, 'Far' otherwise. The result is stored in a new 'distance' column.
is_near = df["DistanceFromHome"] <= 13
df['distance'] = np.where(is_near, 'Near', 'Far')
df
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | distance | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | ... | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 | Near |
| 1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | ... | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 | Near |
| 2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | ... | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 | Near |
| 3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | ... | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 | Near |
| 4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | ... | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 | Near |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 36 | No | Travel_Frequently | 884 | Research & Development | 23 | 2 | Medical | 1 | 2061 | ... | 80 | 1 | 17 | 3 | 3 | 5 | 2 | 0 | 3 | Far |
| 1466 | 39 | No | Travel_Rarely | 613 | Research & Development | 6 | 1 | Medical | 1 | 2062 | ... | 80 | 1 | 9 | 5 | 3 | 7 | 7 | 1 | 7 | Near |
| 1467 | 27 | No | Travel_Rarely | 155 | Research & Development | 4 | 3 | Life Sciences | 1 | 2064 | ... | 80 | 1 | 6 | 0 | 3 | 6 | 2 | 0 | 3 | Near |
| 1468 | 49 | No | Travel_Frequently | 1023 | Sales | 2 | 3 | Medical | 1 | 2065 | ... | 80 | 0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 | Near |
| 1469 | 34 | No | Travel_Rarely | 628 | Research & Development | 8 | 3 | Medical | 1 | 2068 | ... | 80 | 0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 | Near |
1470 rows × 36 columns
# Density of RelationshipSatisfaction for the 'Near' and 'Far' commute groups;
# common_norm=False normalises each group's curve on its own.
kde_style = dict(
    fill=True,
    common_norm=False,
    palette=['red', 'black'],
    alpha=.5,
    linewidth=0,
)
sns.kdeplot(data=df, x="RelationshipSatisfaction", hue="distance", **kde_style)
<Axes: xlabel='RelationshipSatisfaction', ylabel='Density'>
# Summary statistics of the Age column.
df['Age'].describe()
count 1470.000000 mean 36.923810 std 9.135373 min 18.000000 25% 30.000000 50% 36.000000 75% 43.000000 max 60.000000 Name: Age, dtype: float64
# Vertical strip plot of Age per attrition class, points coloured by gender.
sns.stripplot(
    data=df,
    x="Attrition",
    y="Age",
    hue="Gender",
    orient="v",
    alpha=0.3,
)
<Axes: xlabel='Attrition', ylabel='Age'>
# Split the employees into those who left (g) and those who stayed (g2).
g = df.loc[df['Attrition'] == 'Yes']
g2 = df.loc[df['Attrition'] == 'No']

# 2x2 grid of MonthlyIncome densities:
#   top-left: everyone        top-right: both classes overlaid
#   bottom-left: leavers      bottom-right: stayers
fig, axs = plt.subplots(2, 2, figsize=(7, 7))

left_ax = sns.kdeplot(x=g['MonthlyIncome'], ax=axs[1, 0],
                      fill=True, color="blue", alpha=0.5)
left_ax.set_xlabel('MonthlyIncome attrition')

stay_ax = sns.kdeplot(x=g2['MonthlyIncome'], ax=axs[1, 1],
                      fill=True, color="black", alpha=0.5)
stay_ax.set_xlabel('MonthlyIncome stay')

total_ax = sns.kdeplot(x=df['MonthlyIncome'], ax=axs[0, 0],
                       fill=False, alpha=0.5)
total_ax.set_xlabel('MonthlyIncome total')

hue_ax = sns.kdeplot(x=df['MonthlyIncome'], hue=df['Attrition'], ax=axs[0, 1],
                     fill=False, alpha=0.5)
hue_ax.set_xlabel('MonthlyIncome hue')
Text(0.5, 0, 'MonthlyIncome hue')
# Number of employees for every (StockOptionLevel, Attrition) pair, as a
# tidy frame with a 'Count' column.
group_sizes = df.groupby(['StockOptionLevel', 'Attrition']).size()
count_data = group_sizes.reset_index(name='Count')
# One line per attrition class across the stock-option levels.
px.line(data_frame=count_data, x='StockOptionLevel', y='Count',
        color='Attrition', markers=True)
# Modelling copy of the label-encoded frame `data`.
data2 = data.copy()
label = LabelEncoder()
# Replace every column that is still object-dtype with integer label codes.
# The loop body must be indented under the `for`; without the indentation the
# cell does not parse.
for x in data2.select_dtypes(include='object').columns:
    data2[x] = label.fit_transform(data2[x])
data2.head()
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | 1 | 2 | 1102 | 2 | 1 | 2 | 1 | 1 | 1 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | 0 | 1 | 279 | 1 | 8 | 1 | 1 | 1 | 2 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | 1 | 2 | 1373 | 1 | 2 | 2 | 4 | 1 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | 0 | 1 | 1392 | 1 | 3 | 4 | 1 | 1 | 5 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | 0 | 2 | 591 | 1 | 2 | 1 | 3 | 1 | 7 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 35 columns
# Pairwise correlation of all columns of `data2`, computed once and reused for
# both the heatmap and the displayed table. Columns with a single constant
# value have no defined correlation and show up as NaN.
corr_matrix = data2.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2', cbar=False, cmap='Blues_r')
corr_matrix
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Age | 1.000000 | -0.159205 | 0.024751 | 0.010661 | -0.031882 | -0.001686 | 0.208034 | -0.040873 | NaN | -0.010145 | ... | 0.053535 | NaN | 0.037510 | 0.680381 | -0.019621 | -0.021490 | 0.311309 | 0.212901 | 0.216513 | 0.202089 |
| Attrition | -0.159205 | 1.000000 | 0.000074 | -0.056652 | 0.063991 | 0.077924 | -0.031373 | 0.026846 | NaN | -0.010577 | ... | -0.045872 | NaN | -0.137145 | -0.171063 | -0.059478 | -0.063939 | -0.134392 | -0.160545 | -0.033019 | -0.156199 |
| BusinessTravel | 0.024751 | 0.000074 | 1.000000 | -0.004086 | -0.009044 | -0.024469 | 0.000757 | 0.023724 | NaN | -0.015578 | ... | -0.035986 | NaN | -0.016727 | 0.034226 | 0.015240 | -0.011256 | -0.014575 | -0.011497 | -0.032591 | -0.022636 |
| DailyRate | 0.010661 | -0.056652 | -0.004086 | 1.000000 | 0.007109 | -0.004985 | -0.016806 | 0.037709 | NaN | -0.050990 | ... | 0.007846 | NaN | 0.042143 | 0.014515 | 0.002453 | -0.037848 | -0.034055 | 0.009932 | -0.033229 | -0.026363 |
| Department | -0.031882 | 0.063991 | -0.009044 | 0.007109 | 1.000000 | 0.017225 | 0.007996 | 0.013720 | NaN | -0.010895 | ... | -0.022414 | NaN | -0.012193 | -0.015762 | 0.036875 | 0.026383 | 0.022920 | 0.056315 | 0.040061 | 0.034282 |
| DistanceFromHome | -0.001686 | 0.077924 | -0.024469 | -0.004985 | 0.017225 | 1.000000 | 0.021042 | 0.002013 | NaN | 0.032916 | ... | 0.006557 | NaN | 0.044872 | 0.004628 | -0.036942 | -0.026556 | 0.009508 | 0.018845 | 0.010029 | 0.014406 |
| Education | 0.208034 | -0.031373 | 0.000757 | -0.016806 | 0.007996 | 0.021042 | 1.000000 | -0.039592 | NaN | 0.042070 | ... | -0.009118 | NaN | 0.018422 | 0.148280 | -0.025100 | 0.009819 | 0.069114 | 0.060236 | 0.054254 | 0.069065 |
| EducationField | -0.040873 | 0.026846 | 0.023724 | 0.037709 | 0.013720 | 0.002013 | -0.039592 | 1.000000 | NaN | -0.002516 | ... | -0.004378 | NaN | -0.016185 | -0.027848 | 0.049195 | 0.041191 | -0.018692 | -0.010506 | 0.002326 | -0.004130 |
| EmployeeCount | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| EmployeeNumber | -0.010145 | -0.010577 | -0.015578 | -0.050990 | -0.010895 | 0.032916 | 0.042070 | -0.002516 | NaN | 1.000000 | ... | -0.069861 | NaN | 0.062227 | -0.014365 | 0.023603 | 0.010309 | -0.011240 | -0.008416 | -0.009019 | -0.009197 |
| EnvironmentSatisfaction | 0.010146 | -0.103369 | 0.004174 | 0.018355 | -0.019395 | -0.016075 | -0.027128 | 0.043163 | NaN | 0.017621 | ... | 0.007665 | NaN | 0.003432 | -0.002693 | -0.019359 | 0.027627 | 0.001458 | 0.018007 | 0.016194 | -0.004999 |
| Gender | -0.036311 | 0.029453 | -0.032981 | -0.011716 | -0.041583 | -0.001851 | -0.016547 | -0.002504 | NaN | 0.022556 | ... | 0.022868 | NaN | 0.012716 | -0.046881 | -0.038787 | -0.002753 | -0.029747 | -0.041483 | -0.026985 | -0.030599 |
| HourlyRate | 0.024287 | -0.006846 | 0.026528 | 0.023381 | -0.004144 | 0.031131 | 0.016775 | -0.021941 | NaN | 0.035179 | ... | 0.001330 | NaN | 0.050263 | -0.002334 | -0.008548 | -0.004607 | -0.019582 | -0.024106 | -0.026716 | -0.020123 |
| JobInvolvement | 0.029820 | -0.130016 | 0.039062 | 0.046135 | -0.024586 | 0.008783 | 0.042438 | -0.002655 | NaN | -0.006888 | ... | 0.034297 | NaN | 0.021523 | -0.005533 | -0.015338 | -0.014617 | -0.021355 | 0.008717 | -0.024184 | 0.025976 |
| JobLevel | 0.509604 | -0.169105 | 0.019311 | 0.002966 | 0.101963 | 0.005303 | 0.101589 | -0.044933 | NaN | -0.018519 | ... | 0.021642 | NaN | 0.013984 | 0.782208 | -0.018191 | 0.037818 | 0.534739 | 0.389447 | 0.353885 | 0.375281 |
| JobRole | -0.122427 | 0.067151 | 0.002724 | -0.009472 | 0.662431 | -0.001015 | 0.004236 | 0.015599 | NaN | -0.010336 | ... | -0.020218 | NaN | -0.019171 | -0.145439 | 0.001342 | 0.027764 | -0.083657 | -0.028354 | -0.046384 | -0.041150 |
| JobSatisfaction | -0.004892 | -0.103481 | -0.033962 | 0.030571 | 0.021001 | -0.003669 | -0.011296 | -0.034401 | NaN | -0.046247 | ... | -0.012454 | NaN | 0.010690 | -0.020185 | -0.005779 | -0.019459 | -0.003803 | -0.002305 | -0.018214 | -0.027656 |
| MaritalStatus | -0.095029 | 0.162070 | 0.024001 | -0.069586 | 0.056073 | -0.014437 | 0.004053 | 0.014420 | NaN | -0.008155 | ... | 0.022549 | NaN | -0.662577 | -0.077886 | 0.010629 | 0.014708 | -0.059986 | -0.065822 | -0.030915 | -0.038570 |
| MonthlyIncome | 0.497855 | -0.159840 | 0.034319 | 0.007707 | 0.053130 | -0.017014 | 0.094961 | -0.041070 | NaN | -0.014829 | ... | 0.025873 | NaN | 0.005408 | 0.772893 | -0.021736 | 0.030683 | 0.514285 | 0.363818 | 0.344978 | 0.344079 |
| MonthlyRate | 0.028051 | 0.015170 | -0.014107 | -0.032182 | 0.023642 | 0.027473 | -0.026084 | -0.027182 | NaN | 0.012648 | ... | -0.004085 | NaN | -0.034323 | 0.026442 | 0.001467 | 0.007963 | -0.023655 | -0.012815 | 0.001567 | -0.036746 |
| NumCompaniesWorked | 0.299635 | 0.043494 | 0.020875 | 0.038153 | -0.035882 | -0.029251 | 0.126317 | -0.008663 | NaN | -0.001251 | ... | 0.052733 | NaN | 0.030075 | 0.237639 | -0.066054 | -0.008366 | -0.118421 | -0.090754 | -0.036814 | -0.110319 |
| Over18 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| OverTime | 0.028062 | 0.246118 | 0.016543 | 0.009135 | 0.007481 | 0.025514 | -0.020322 | 0.002259 | NaN | -0.024037 | ... | 0.048493 | NaN | -0.000449 | 0.012754 | -0.079113 | -0.027092 | -0.011687 | -0.029758 | -0.012239 | -0.041586 |
| PercentSalaryHike | 0.003634 | -0.013478 | -0.029377 | 0.022704 | -0.007840 | 0.040235 | -0.011111 | -0.011214 | NaN | -0.012944 | ... | -0.040490 | NaN | 0.007528 | -0.020608 | -0.005221 | -0.003280 | -0.035991 | -0.001520 | -0.022154 | -0.011985 |
| PerformanceRating | 0.001904 | 0.002889 | -0.026341 | 0.000473 | -0.024604 | 0.027110 | -0.024539 | -0.005614 | NaN | -0.020359 | ... | -0.031351 | NaN | 0.003506 | 0.006744 | -0.015579 | 0.002572 | 0.003435 | 0.034986 | 0.017896 | 0.022827 |
| RelationshipSatisfaction | 0.053535 | -0.045872 | -0.035986 | 0.007846 | -0.022414 | 0.006557 | -0.009118 | -0.004378 | NaN | -0.069861 | ... | 1.000000 | NaN | -0.045952 | 0.024054 | 0.002497 | 0.019604 | 0.019367 | -0.015123 | 0.033493 | -0.000867 |
| StandardHours | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| StockOptionLevel | 0.037510 | -0.137145 | -0.016727 | 0.042143 | -0.012193 | 0.044872 | 0.018422 | -0.016185 | NaN | 0.062227 | ... | -0.045952 | NaN | 1.000000 | 0.010136 | 0.011274 | 0.004129 | 0.015058 | 0.050818 | 0.014352 | 0.024698 |
| TotalWorkingYears | 0.680381 | -0.171063 | 0.034226 | 0.014515 | -0.015762 | 0.004628 | 0.148280 | -0.027848 | NaN | -0.014365 | ... | 0.024054 | NaN | 0.010136 | 1.000000 | -0.035662 | 0.001008 | 0.628133 | 0.460365 | 0.404858 | 0.459188 |
| TrainingTimesLastYear | -0.019621 | -0.059478 | 0.015240 | 0.002453 | 0.036875 | -0.036942 | -0.025100 | 0.049195 | NaN | 0.023603 | ... | 0.002497 | NaN | 0.011274 | -0.035662 | 1.000000 | 0.028072 | 0.003569 | -0.005738 | -0.002067 | -0.004096 |
| WorkLifeBalance | -0.021490 | -0.063939 | -0.011256 | -0.037848 | 0.026383 | -0.026556 | 0.009819 | 0.041191 | NaN | 0.010309 | ... | 0.019604 | NaN | 0.004129 | 0.001008 | 0.028072 | 1.000000 | 0.012089 | 0.049856 | 0.008941 | 0.002759 |
| YearsAtCompany | 0.311309 | -0.134392 | -0.014575 | -0.034055 | 0.022920 | 0.009508 | 0.069114 | -0.018692 | NaN | -0.011240 | ... | 0.019367 | NaN | 0.015058 | 0.628133 | 0.003569 | 0.012089 | 1.000000 | 0.758754 | 0.618409 | 0.769212 |
| YearsInCurrentRole | 0.212901 | -0.160545 | -0.011497 | 0.009932 | 0.056315 | 0.018845 | 0.060236 | -0.010506 | NaN | -0.008416 | ... | -0.015123 | NaN | 0.050818 | 0.460365 | -0.005738 | 0.049856 | 0.758754 | 1.000000 | 0.548056 | 0.714365 |
| YearsSinceLastPromotion | 0.216513 | -0.033019 | -0.032591 | -0.033229 | 0.040061 | 0.010029 | 0.054254 | 0.002326 | NaN | -0.009019 | ... | 0.033493 | NaN | 0.014352 | 0.404858 | -0.002067 | 0.008941 | 0.618409 | 0.548056 | 1.000000 | 0.510224 |
| YearsWithCurrManager | 0.202089 | -0.156199 | -0.022636 | -0.026363 | 0.034282 | 0.014406 | 0.069065 | -0.004130 | NaN | -0.009197 | ... | -0.000867 | NaN | 0.024698 | 0.459188 | -0.004096 | 0.002759 | 0.769212 | 0.714365 | 0.510224 | 1.000000 |
35 rows × 35 columns
# Target: the encoded Attrition column. Features: every other column.
y = data2['Attrition']
X = data2.drop(columns='Attrition')
# NOTE(review): X still contains the constant columns (EmployeeCount,
# StandardHours, Over18) and the EmployeeNumber identifier — confirm whether
# they should be used as model inputs.
key = X.keys()
X.head()
| Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | ... | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | 2 | 1102 | 2 | 1 | 2 | 1 | 1 | 1 | 2 | ... | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | 1 | 279 | 1 | 8 | 1 | 1 | 1 | 2 | 3 | ... | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | 2 | 1373 | 1 | 2 | 2 | 4 | 1 | 4 | 4 | ... | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | 1 | 1392 | 1 | 3 | 4 | 1 | 1 | 5 | 4 | ... | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | 2 | 591 | 1 | 2 | 1 | 3 | 1 | 7 | 1 | ... | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
5 rows × 34 columns
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# The target is imbalanced (far more "No" than "Yes"), so stratify on y to
# keep the same class proportions in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y
)
print('X_train shape is ' , X_train.shape)
print('X_test shape is ' , X_test.shape)
print('y_train shape is ' , y_train.shape)
print('y_test shape is ' , y_test.shape)
X_train shape is (1176, 34) X_test shape is (294, 34) y_train shape is (1176,) y_test shape is (294,)
# Random-forest classifier behind a min-max scaler, fitted on the training split.
random = Pipeline(steps=[
    ('min_max', MinMaxScaler()),
    ('model', RandomForestClassifier(
        n_estimators=100,
        criterion='gini',
        max_depth=10,
        random_state=44,
    )),
])
random.fit(X_train, y_train)
Pipeline(steps=[('min_max', MinMaxScaler()),
('model',
RandomForestClassifier(max_depth=10, random_state=44))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('min_max', MinMaxScaler()),
('model',
RandomForestClassifier(max_depth=10, random_state=44))])MinMaxScaler()
RandomForestClassifier(max_depth=10, random_state=44)
# Mean accuracy of the fitted random-forest pipeline on each split.
rf_train_score = random.score(X_train, y_train)
rf_test_score = random.score(X_test, y_test)
print('RandomForestClassifierModel Train Score is : ', rf_train_score)
print('RandomForestClassifierModel Test Score is : ', rf_test_score)
RandomForestClassifierModel Train Score is : 0.9795918367346939 RandomForestClassifierModel Test Score is : 0.8707482993197279
# RBF-kernel SVM behind a min-max scaler, fitted on the training split.
# No iteration cap is set: with max_iter=100 the solver was stopped before
# converging, and the model scored about 0.50 on both splits — well below what
# always predicting the majority class achieves on this data.
svc = Pipeline([
    ('min_max', MinMaxScaler()),
    ('model', SVC(kernel='rbf', C=2.0, gamma='auto')),
])
svc.fit(X_train, y_train)
Pipeline(steps=[('min_max', MinMaxScaler()),
('model', SVC(C=2.0, gamma='auto', max_iter=100))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('min_max', MinMaxScaler()),
('model', SVC(C=2.0, gamma='auto', max_iter=100))])MinMaxScaler()
SVC(C=2.0, gamma='auto', max_iter=100)
# Mean accuracy of the fitted SVC pipeline on each split.
svc_train_score = svc.score(X_train, y_train)
svc_test_score = svc.score(X_test, y_test)
print('SVCModel Train Score is : ', svc_train_score)
print('SVCModel Test Score is : ', svc_test_score)
SVCModel Train Score is : 0.5008503401360545 SVCModel Test Score is : 0.5136054421768708